import pandas as pd

# Load the pro-vaccine tweet corpus; low_memory=False forces pandas to
# infer each column's dtype from the whole file instead of per-chunk.
provaxxers = pd.read_csv('./datasets/kdmile/provaxxers.csv', low_memory=False)
# To analyze the anti-vaccine corpus instead, uncomment the line below
# and point `docs` at it.
#antivaxxers = pd.read_csv('./datasets/kdmile/antivaxxers.csv', low_memory=False)

# `docs` is the dataset the rest of the pipeline operates on.
docs = provaxxers
import os

from umap import UMAP
from bertopic import BERTopic

# Silence the HuggingFace tokenizers fork warning before any model work starts.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Dimensionality reduction used by BERTopic before HDBSCAN clustering:
# cosine metric on the sentence embeddings, squeezed down to 10 components.
umap_model = UMAP(
    n_neighbors=15,
    n_components=10,
    min_dist=0.0,
    metric='cosine',
)

# Multilingual topic model over uni- and bi-grams; topics smaller than 45
# documents are discarded and similar topics are merged automatically
# (nr_topics='auto'). Probabilities are skipped to save time and memory.
topic_model = BERTopic(
    language='multilingual',
    top_n_words=10,
    n_gram_range=(1, 2),
    min_topic_size=45,
    nr_topics='auto',
    umap_model=umap_model,
    low_memory=True,
    calculate_probabilities=False,
    verbose=True,
)

# Fit on the tweet text; `probs` is None because probabilities are disabled.
topics, probs = topic_model.fit_transform(docs.text)
# Run log:
# 2021-08-07 13:11:55,060 - BERTopic - Transformed documents to Embeddings
# 2021-08-07 13:21:50,229 - BERTopic - Reduced dimensionality with UMAP
# 2021-08-07 13:22:10,692 - BERTopic - Clustered UMAP embeddings with HDBSCAN
# 2021-08-07 13:22:44,805 - BERTopic - Reduced number of topics from 423 to 239
# Interactive intertopic distance map of the topics found so far
# (in a notebook the returned Plotly figure renders inline).
topic_model.visualize_topics()
# Further merge topics down to roughly 9 (BERTopic keeps the -1 outlier
# topic, so the effective count ends up one higher — see the log below).
# NOTE(review): this is the pre-0.10 reduce_topics signature that returns
# (topics, probs); newer BERTopic versions changed this API — confirm the
# installed version before upgrading.
newTopics, newProbs = topic_model.reduce_topics(docs.text, topics, probs, nr_topics=9)
# Run log:
# 2021-08-07 13:23:40,364 - BERTopic - Reduced number of topics from 239 to 10
# Re-plot the intertopic distance map after the reduction to ~10 topics.
topic_model.visualize_topics()
# Pull the raw text and its creation timestamps out of the DataFrame as
# plain Python lists, as expected by topics_over_time.
tweet_times = docs.created_at.to_list()
tweet_texts = docs.text.to_list()

# Dynamic topic modeling: bucket the corpus into 20 time bins and track
# how each (reduced) topic's representation evolves. Global and evolution
# tuning smooth the per-bin word weights toward the global topic and the
# previous bin, respectively.
topics_over_time = topic_model.topics_over_time(
    docs=tweet_texts,
    topics=newTopics,
    timestamps=tweet_times,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20,
)
# Run log (tqdm progress): 20it [01:23, 4.19s/it]
# Plot topic frequencies per time bin; top_n_topics=30 is an upper bound,
# so all ~10 remaining topics are shown.
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=30)